library(dplyr)
library(xtable)

## Survey: respondent proportions per demographic category

# Load the cleaned Study 2 survey responses.
d <- read.csv("surveydata_clean_study2.csv")

# Wave whose respondents are examined for representativeness; the number is
# also embedded in the names of the output files written by this script.
wave <- 2

if (wave == 2) {
  # Keep only the rows of respondents who started wave 2, identified here by
  # a non-missing disc_treatment value.
  started_wave2 <- !is.na(d$disc_treatment)
  d <- d[started_wave2, ]
}

# Danish education answers, listed from lowest to highest level.
ed_levels <- c(
  "Grund-/folkeskole",
  "Almen gymnasial uddannelse (studentereksamen/HF)",
  "Erhvervsgymnasial uddannelse (HH/HTX/HHX)",
  "Erhvervsfaglig uddannelse",
  "Kort videregående uddannelse under 3 år",
  "Mellemlang videregående uddannelse 3-4 år",
  "Lang videregående uddannelse 5 år eller mere",
  "Forskeruddannelse (f.eks. ph.d.)"
)

# English label for each Danish level, position by position. The two
# gymnasium tracks share the label "High school", so they collapse into a
# single factor level.
ed_levels_eng <- c(
  "Elementary",
  "High school",
  "High school",
  "Professional",
  "Short tertiary",
  "Bachelor",
  "Master",
  "PhD"
)

# Ordered-by-level education factor with English labels; answers that are not
# in ed_levels become NA.
d$education <- factor(d$profile_education,
                      levels = ed_levels,
                      labels = ed_levels_eng)

# Collapse the 5-year age bands of profile_age5 into wider groups
# (15 years for the youngest group, 10 years otherwise, open-ended from 73).
# Values that are not listed here are passed through unchanged by recode().
age_map <- c(
  "18-22" = "Age 18-32",
  "23-27" = "Age 18-32",
  "28-32" = "Age 18-32",
  "33-37" = "Age 33-42",
  "38-42" = "Age 33-42",
  "43-47" = "Age 43-52",
  "48-52" = "Age 43-52",
  "53-57" = "Age 53-62",
  "58-62" = "Age 53-62",
  "63-67" = "Age 63-72",
  "68-72" = "Age 63-72",
  "73-77" = "Age 73+",
  "78-82" = "Age 73+",
  "83+"   = "Age 73+"
)
d$age_cat10 <- recode(d$profile_age5, !!!age_map)

# Background variables whose category shares are tabulated, in the order in
# which they appear in the final table.
backgrcols <- c("gender", "age_cat10", "region", "education")

# One (category, proportion) data frame per background variable, stacked into
# a single long table. Proportions are computed within each variable, so they
# sum to 1 per variable; table() leaves NA values out of the counts.
proptab_svy <- do.call(
  rbind,
  lapply(backgrcols, function(bgcol) {
    shares <- data.frame(prop.table(table(d[bgcol])))
    names(shares) <- c("category", "proportion")
    shares
  })
)


## Danmarks Statistik (Statistics Denmark): population proportions

# gender
# The gender/age file is read once here and reused for the age shares below.
# NOTE(review): presumably one row per age with counts per gender -- confirm.
gender_age <- read.csv("DK Statistik/gender_age.csv")

# Total number of men ("Mand") and women ("Kvinde"), then each one's share of
# the combined total.
gender_tab <- colSums(gender_age[c("Mand", "Kvinde")])
gender_prop <- prop.table(gender_tab)

# age
# Lower limits of the population age bins, mirroring the survey's age_cat10
# groups. With right = FALSE the bins are [18,33), [33,43), ..., [63,73);
# include.lowest = TRUE additionally closes the last bin, giving [73,125].
# Ages outside 18-125 become NA and are left out of the tabulation below.
age_lowlims <- c(18, seq(33, 73, 10), 125)

# The labels are spelled out instead of being derived from the survey data
# (previously sort(unique(d$age_cat10))): if the filtered survey sample lacks
# an age group, or contains an unrecoded value, the derived labels no longer
# match the six bins and cut() fails. These are the same six labels, in the
# same order, that the age_cat10 recoding produces.
age_labels <- c("Age 18-32", "Age 33-42", "Age 43-52",
                "Age 53-62", "Age 63-72", "Age 73+")
gender_age$age_cat <- cut(gender_age$alder, age_lowlims,
                          include.lowest = TRUE, right = FALSE,
                          labels = age_labels)

# Population count per age group and each group's share of the total.
age_tab <- tapply(gender_age$Total, gender_age$age_cat, sum)
age_prop <- age_tab / sum(age_tab)

# region
# Keep only the region name and its population total.
region <- read.csv("DK Statistik/region.csv")[c("region", "total")]

# Share of the total population per region, named by region.
region_prop <- setNames(prop.table(region$total), region$region)

# education
# Keep only the education label and its population total, and index the rows
# by label so single levels can be addressed by name.
education <- read.csv("DK Statistik/education.csv")[c("uddannelse", "total")]
rownames(education) <- education$uddannelse

# sum H50 and H60
# The medium-length tertiary (H50) and bachelor (H60) rows are pooled into a
# single "H50+H60 Bachelor" row.
mvu_row <- "H50 Mellemlange videregående uddannelser, MVU"
bach_row <- "H60 Bacheloruddannelser, BACH"
total_H50_H60 <- education[mvu_row, "total"] + education[bach_row, "total"]
edu_joint <- data.frame(uddannelse = "H50+H60 Bachelor",
                        total = total_H50_H60)
education <- rbind(education, edu_joint)

# drop levels that have been summed, and H90 unknown
dropeds <- c(mvu_row, bach_row, "H90 Uoplyst mv.")
keep_rows <- !(row.names(education) %in% dropeds)
education <- education[keep_rows, ]

# Sort by label so the pooled row falls into place between the other H-codes.
education <- education[order(education$uddannelse), ]

# get proportions
edu_prop <- prop.table(education$total)
# Labels are assigned by position, so the sorted rows must line up one-to-one
# with the unique English survey levels.
# NOTE(review): assumes exactly seven rows remain at this point -- confirm
# against the contents of education.csv.
names(edu_prop) <- unique(ed_levels_eng)

# Stack the population shares of all background variables into one long
# table with the same two columns as the survey table. The order (gender,
# age, region, education) matches the order used for the survey table.
dk_props <- c(gender_prop, age_prop, region_prop, edu_prop)
proptab_DK <- data.frame(category = names(dk_props),
                         proportion = dk_props)


## merge survey and DK Statistik

# Inner join on the category label: proportion.x is the survey share and
# proportion.y the population share.
proptab <- merge(proptab_svy, proptab_DK, by = "category", sort = FALSE)

# An inner join silently drops every category whose label is not spelled
# identically in both sources. Report such labels, because a dropped row also
# shifts the rows below it relative to the fixed horizontal-rule positions
# used when the LaTeX table is printed.
svy_cats <- as.character(proptab_svy$category)
dk_cats <- as.character(proptab_DK$category)
unmatched <- c(setdiff(svy_cats, dk_cats), setdiff(dk_cats, svy_cats))
if (length(unmatched) > 0) {
  warning("Categories present in only one source were dropped: ",
          paste(unmatched, collapse = ", "), call. = FALSE)
}

# merge(sort = FALSE) leaves the row order unspecified, so restore the order
# of the survey table (gender, age, region, education) explicitly.
proptab <- proptab[order(match(as.character(proptab$category), svy_cats)), ]
rownames(proptab) <- NULL

# Percentages with one decimal; the difference is taken between the rounded
# values so that it agrees with the two columns shown in the table.
proptab$percent_survey <- round(proptab$proportion.x * 100, 1)
proptab$percent_population <- round(proptab$proportion.y * 100, 1)
proptab$difference <- proptab$percent_survey - proptab$percent_population
proptab <- proptab[c("category", "percent_survey",
                     "percent_population", "difference")]

# further level translations
# Convert to character first so that labels can be overwritten freely.
proptab$category <- as.character(proptab$category)
proptab$category[proptab$category == "Kvinde"] <- "Female"
proptab$category[proptab$category == "Mand"] <- "Male"
proptab$category[proptab$category == "Hovedstaden"] <- "Capital region"

# write to files
# Both outputs share one path stem that carries the wave number.
out_stem <- file.path("tables",
                      paste0("demographic_proportions2_wave", wave))

# CSV copy, written with the original machine-friendly column names.
write.csv(proptab, paste0(out_stem, ".csv"))

# Reader-friendly headers for the LaTeX version only.
names(proptab)[2:3] <- c("pct. in survey", "pct. in population")
proptabx <- xtable(
  proptab,
  caption = "Comparing proportions of Study 2 respondents in demographic categories to Danish population statistics (source: Danmark Statistik, Q1 2022).",
  label = "tab:represent2",
  digits = c(0, 0, 1, 1, 1)
)

# Horizontal rules go above and below the header (-1, 0) and after rows
# 2, 8, 13 and 20, i.e. after the gender, age, region and education groups
# when the table has 2 + 6 + 5 + 7 rows in that order.
print(proptabx,
      file = paste0(out_stem, ".tex"),
      include.rownames = FALSE,
      hline.after = c(-1, 0, 2, 8, 13, 20))